import requests
import re,os,pymysql
from bs4 import BeautifulSoup


# Module-level accumulator shared by the pipeline: parse_data() appends one
# dict per scraped recipe ({"pic_img", "title", "main_food"}) and
# store_data() later reads it to insert rows into MySQL.
cate_list = []

# -- Scraping ---------------------------------------------------------------
def get_html(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Uses the module-level ``headers`` dict (defined in the ``__main__``
    block) to look like a regular browser request.

    ``timeout`` (seconds) is new but defaults to 10, so existing callers
    are unaffected; without it ``requests.get`` can block indefinitely on
    a stalled connection.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 so r.text decodes the Chinese content correctly instead of
    # relying on requests' charset guess.
    r.encoding = "utf-8"
    return r.text


# -- Parsing ----------------------------------------------------------------
def parse_data(html_doc):
    """Parse a recipe-listing page and collect/download its recipe data.

    For every recipe card found in *html_doc* this:
      * extracts the cover-image URL (from the inline CSS ``style``),
        the recipe title and the main-ingredient summary,
      * appends a ``{"pic_img", "title", "main_food"}`` dict to the
        module-level ``cate_list`` (consumed later by ``store_data``),
      * downloads the cover image into ``D://CateImg//`` named after the
        URL's basename.

    A failed image download is logged and skipped; parsing continues.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    root = "D://CateImg//"  # destination folder for downloaded cover images
    div_list = soup.select("body > div.main_w.clearfix > article > div.list_s2 > div.list_s2_content > div")
    for div in div_list:
        title = div.select("a.list_s2_item_info > strong.title")[0].string
        main_food = div.select("a.list_s2_item_info > span.sc")[0].string
        img = div.select("div.imgw > a.list_s2_item_img")[0].get("style")
        # style looks like "background-image: url(http://...jpg)"; take what
        # is inside the parentheses and strip optional CSS quotes around it
        # (a no-op when the URL is unquoted).
        pic_img = img.split('(')[-1].split(')')[0].strip('\'"')
        print(pic_img)
        print(title)
        print(main_food)
        print("-"*30)
        firstDict = {"pic_img":pic_img, "title":title, "main_food":main_food}
        cate_list.append(firstDict)
        print("获取的数据",cate_list)
        url = root + pic_img.split("/")[-1]
        print(url)
        try:
            # exist_ok avoids the check-then-create race of the original
            # os.path.exists() + os.makedirs() pair.
            os.makedirs(root, exist_ok=True)

            # Timeout so one stalled image download cannot hang the whole run.
            r = requests.get(pic_img, timeout=10)
            print("文件大小", len(r.content) / 1024, "kb")
            with open(url, "wb") as f:
                print("正在保存文件...")
                f.write(r.content)  # write the raw image bytes
                print("文件保存成功")
        except Exception as e:
            # Best-effort download: log the failure and move on to the next card.
            print("爬取失败", e)

# -- Persistence ------------------------------------------------------------
def store_data():
    """Insert every record collected in ``cate_list`` into MySQL.

    Connects to the local ``category`` database and inserts one row per
    scraped recipe into ``cate_tb`` (NULL lets the auto-increment primary
    key fill itself in).  Each row is committed individually, matching the
    original behavior; on any error the open transaction is rolled back.

    Fix: the original only closed the connection inside ``except``, so the
    cursor and connection leaked on the success path.  Cleanup now runs
    unconditionally in ``finally``.
    """
    con = pymysql.connect(host='localhost', port=3306, database="category", user='root', password='123', charset="utf8")
    c = con.cursor()
    try:
        # %s placeholders keep the insert parameterized (no SQL injection).
        sql = """
            insert into cate_tb values
            (NULL,%s,%s,%s);
        """
        for i in cate_list:
            print(i)
            cate = c.execute(sql, (i['pic_img'], i['title'], i['main_food']))
            print("数据>>", cate)
            con.commit()
    except Exception as e:
        print('错误', e)
        con.rollback()
    finally:
        # Always release DB resources, success or failure.
        c.close()
        con.close()


if __name__ == "__main__":
    # Listing page for the "Sichuan cuisine" (chuancai) category.
    listing_url = "https://www.meishij.net/caixi/chuancai/"
    # Browser-like request headers; get_html() reads this module-level name.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50',
        'Connection': 'keep-alive',
        'Referer': 'https://news.sina.com.cn/'
    }
    # Pipeline: fetch the page, parse it (downloading images along the way),
    # then persist the collected records to MySQL.
    parse_data(get_html(listing_url))
    store_data()
